/* Table 6 Column 1: bachelor-skewness regression with ravenpack */
libname edu '!userprofile\\Dropbox\Education\Replication\Data';

* get data;
proc sql;
  create table regression2 as
  select distinct a.log_bachelor, a.year_ending, a.majorcode, b.major
  from edu.enrolment as a, edu.skew as b
  where a.majorcode = b.major;
quit;

*get industry controls and skewness;
proc sql;
  create table regression2 as
  select distinct a.*, b.log_mktcape_major, b.log_btme_major, b.log_agee_major
  from regression2 as a, edu.skew as b
  where a.year_ending = b.year+3 and a.major = b.major;
quit;

proc sql;
  create table enrolment_skew_return2 as
  select a.*, b.ravenskew1e_major
  from regression2 as a, edu.skew as b
  where 3 <= a.year_ending - b.year <= 7 and a.majorcode = b.major and a.major = b.major and a.major ~= .;
quit;

data enrolment_skew_return2; set enrolment_skew_return2; if ravenskew1e_major = . then delete; run;

* calculate the average;
proc sort data = enrolment_skew_return2; by year_ending majorcode major; run;

proc means data = enrolment_skew_return2 noprint;
  by year_ending majorcode major;
  var ravenskew1e_major;
  output out = return2 mean = ravenskew1e_major_avg2;
run;

proc sql;
  create table regression2 as
  select a.*, b.ravenskew1e_major_avg2
  from regression2 as a, return2 as b
  where a.year_ending = b.year_ending and a.majorcode = b.majorcode and a.major = b.major and a.major ~= .;
quit;

* add time-series measures;
proc sql;
  create table temp as
  select distinct a.*, b.tsmean5_major as tsmean_major_avg2, b.tsvol5_major as tsvol_major_avg2
  from regression2 as a left join edu.skew as b
  on a.year_ending = b.year+3 and a.major = b.major;
quit;

data regression_avg2; set temp; run;

/* merge with wage data (average industry entry-level wage and average major entry-level wage) */
data wage (keep = year major annual_wage);
  set edu.wagesoc;
  if year < 1999 then delete;
run;

data wage2 (keep = year major annual_wage);
  set edu.compustat_wage;
  if year >= 1999 then delete;
  annual_wage = annual_wagee_major;
run;

proc append base = wage2 data = wage; run;

proc sql;
  create table regression_avg2 as
  select a.*, b.annual_wage as annual_wage_avg2, b.year
  from regression_avg2 as a left join wage2 as b
  on a.majorcode = b.major and 
  a.year_ending - b.year = 3;
quit;

data regression_avg2; set regression_avg2; if majorcode = 12 or majorcode = 21 then delete; run;

* year fixed effects, major fixed effects, cluster std error by year;
proc standard data = regression_avg2 out = regression_avg3 mean = 0 std = 1; run;

data regression_avg2; set regression_avg2; num = _N_; run;
data regression_avg3 (drop = log_bachelor year_ending majorcode); set regression_avg3; num = _N_; run;

proc sql;
  create table regression_avg2 as
  select a.*, b.log_bachelor, b.year_ending, b.majorcode
  from regression_avg3 as a, regression_avg2 as b
  where a.num = b.num;
quit;

proc surveyreg data = regression_avg2;
  class year_ending majorcode;
  cluster year_ending;
  model log_bachelor = 

  ravenskew1e_major_avg2
  tsmean_major_avg2
  tsvol_major_avg2
  annual_wage_avg2
  log_mktcape_major
  log_btme_major
  log_agee_major

  year_ending majorcode/solution adjrsq;
run;
